library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(viridis)
## Loading required package: viridisLite
library(ggridges)
library(patchwork)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Race years whose result files share the older (shorter) column layout.
years_1 <- c(1900:2012, 2014)
# Race years whose result files use the newer (longer) column layout.
years_2 <- 2015:2019

#' Read one results file using the column layout that matches its year.
#'
#' The year is detected from the file path: a path containing any year in
#' `years_1` is read with the shorter column specification, and a path
#' containing any year in `years_2` with the longer one. In both layouts the
#' strings "NULL", "" and "0" are treated as missing values.
#'
#' @param x Path to a single CSV file (a length-one character vector).
#' @return The tibble produced by `read_csv()`, or `NULL` (with a warning)
#'   when the path contains none of the known years.
importing_data <- function(x) {
  # TRUE when the path mentions any of the given years.
  path_has_year <- function(years) {
    str_detect(x, str_c(years, collapse = "|"))
  }

  if (path_has_year(years_1)) {
    read_csv(x, na = c("NULL", "", "0"), col_types = "cicccciiiicc")
  } else if (path_has_year(years_2)) {
    read_csv(
      x,
      na = c("NULL", "", "0"),
      col_types = "cccicccccccccccccccccciiiiccc"
    )
  } else {
    # Previously this case fell through silently; make the skipped file
    # visible while still returning NULL so callers keep working.
    warning("No column specification for file: ", x, call. = FALSE)
    NULL
  }
}

# Read every file under data/ with importing_data(), stack the results, and
# tidy the combined table:
#   * year is parsed from the first number found in the file path;
#   * city falls back to residence when city is missing;
#   * every character in display_name other than ASCII letters and digits
#     becomes a space;
#   * rows without a display_name are dropped, as are the helper columns.
boston_df <-
  tibble(file_name = list.files("data", full.names = TRUE)) %>%
  mutate(data = map(file_name, importing_data)) %>%
  unnest(data) %>%
  mutate(
    year = readr::parse_number(file_name),
    city = coalesce(city, residence),
    display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")
  ) %>%
  filter(!is.na(display_name)) %>%
  select(-c(file_name, residence, first_name, last_name))

# Default figure geometry for every knitted chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = .6, out.width = "90%")

# Minimal theme with the legend placed under the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Use viridis for continuous colour and fill scales.
options(
  ggplot2.continuous.fill = "viridis",
  ggplot2.continuous.colour = "viridis"
)

# Make the default discrete colour and fill scales viridis as well.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

Prepare age data

# Keep only the age column and discard rows where age is missing.
boston_df_age_plot <-
  boston_df %>%
  drop_na(age) %>%
  select(age)

Age distribution overall (counts)

This is plotly:

# Interactive bar chart of the number of participants at each age,
# coloured by age on the viridis palette.
age_plotly_original <-
  plot_ly(
    data = count(boston_df_age_plot, age),
    x = ~age,
    y = ~n,
    color = ~age,
    colors = "viridis",
    type = "bar"
  ) %>%
  layout(
    xaxis = list(title = "Age"),
    yaxis = list(title = "Number of Participants")
  )

This is plain ggplot:

# Static bar chart of participant counts by age.
# geom_bar() counts the rows at each distinct age value and does no binning,
# so the subtitle describes that instead of claiming a 5-year bin width.
age_ggplot <-
  ggplot(boston_df_age_plot, aes(x = age)) +
  geom_bar(fill = "cornflowerblue", color = "white") +
  labs(
    title = "Participants by age",
    subtitle = "count at each age",
    x = "Age",
    y = "Number of participants"
  )

This is plotly from ggplot:

# Convert the static ggplot bar chart into an interactive plotly widget.
plotly::ggplotly(p = age_ggplot)

Histogram of percentages by age

# Histogram of ages in 5-year bins, with the y axis showing each bin's share
# of all participants rather than a raw count.
# after_stat() replaces the older `..count..` notation for referring to
# variables computed by the stat.
age_percentage_ggplot <-
  ggplot(
    boston_df_age_plot,
    aes(x = age, y = after_stat(count / sum(count)))
  ) +
  geom_histogram(
    fill = "cornflowerblue",
    color = "white",
    binwidth = 5
  ) +
  labs(
    title = "Participants by age",
    y = "Percent",
    x = "Age"
  ) +
  scale_y_continuous(labels = percent)

Interactive plotly version of the percentage histogram:

# Convert the percentage histogram into an interactive plotly widget.
plotly::ggplotly(p = age_percentage_ggplot)

Density plot of ages

# Kernel density estimate of participant age, then its interactive version.
density_plot <-
  ggplot(boston_df_age_plot, aes(x = age)) +
  geom_density(fill = "cornflowerblue", color = "black", alpha = 0.8) +
  labs(title = "Distribution of age")
ggplotly(density_plot)

Clean gender data

# Count participants per gender.
# "U" is treated as missing, lowercase "m" is folded into "M", and the codes
# are relabelled "male" / "female"; rows with no usable gender are dropped.
boston_df_gender <-
  boston_df %>%
  select(gender) %>%
  mutate(
    gender = gender %>%
      na_if("U") %>%
      recode(m = "M") %>%
      factor(levels = c("M", "F"), labels = c("male", "female"))
  ) %>%
  drop_na(gender) %>%
  count(gender)

Participants by gender (%’s)

# Add each gender's share of participants and a rounded percent label.
bar_graph_gender <-
  boston_df_gender %>%
  mutate(
    pct = n / sum(n),
    pctlabel = paste0(round(pct * 100), "%")
  )

# Plot the bars as percentages,
# in descending order with bar labels.
ggplot(bar_graph_gender, aes(x = reorder(gender, -pct), y = pct)) +
  geom_col(fill = "indianred3", color = "black") +
  geom_text(aes(label = pctlabel), vjust = -0.25) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Participants by gender",
    x = "Gender",
    y = "Percent"
  )

Participants by gender (counts)

# Bar chart of raw participant counts per gender, with the count printed
# above each bar and thousands separators on the y axis.
bar_graph_frequency <-
  ggplot(bar_graph_gender, aes(x = gender, y = n)) +
  geom_col(fill = "indianred3", color = "black") +
  geom_text(aes(label = n), vjust = -0.5) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Participants by gender",
    x = "Gender",
    y = "Frequency"
  )

Prepare gender change over time data

# Count participants per year and gender.
# "U" is treated as missing, lowercase "m" is folded into "M", and the codes
# are relabelled "male" / "female"; rows missing gender or year are dropped.
# count(year, gender) returns an ungrouped table, so no grouping is left over
# for later steps (group_by() followed by count() kept the result grouped).
boston_df_gender_time_plot <-
  boston_df %>%
  select(year, gender) %>%
  mutate(
    gender = na_if(gender, "U"),
    gender = recode(gender, m = "M"),
    gender = factor(gender, levels = c("M", "F"), labels = c("male", "female"))
  ) %>%
  drop_na(gender) %>%
  drop_na(year) %>%
  count(year, gender)

Gender distribution over time

# Line chart of yearly participant counts, one line per gender, with the
# x axis limited to 1960-2019; then its interactive version.
gender_time_plot <-
  ggplot(boston_df_gender_time_plot, aes(x = year, y = n)) +
  geom_line(aes(color = gender), size = 1) +
  xlim(1960, 2019) +
  labs(
    title = "Participants over time by gender",
    x = "year",
    y = "participants"
  )
ggplotly(gender_time_plot)